se <- maxQuant("proteinGroups_thorax.xlsx", intensity = "LFQ", type = "xlsx",
sheet = "filtered")
## load annotation
cD <- openxlsx::read.xlsx("proteinGroups_thorax.xlsx", sheet = "annotation_filtered")
cD[, "Sample_IDs"] <- make.names(cD[, "Sample_IDs"])
colnames(cD)[colnames(cD) == "Sample_IDs"] <- "name"
## truncate se and write cD to colData(se)
se <- se[, cD$name]
se@colData <- cD %>% DataFrame()
colnames(se) <- se$name
## truncate colnames
colnames(se) <- stringr::str_replace(colnames(se),
"LFQ.intensity.1_KL_ProtMet_IO_40min_DDA_", "ProtMet_")
colnames(se) <- stringr::str_replace(colnames(se),
"LFQ.intensity.5_KL_nWorkFlow_IO_40min_DDA_", "nWorkFlow_")
colnames(se) <- stringr::str_replace(colnames(se),
"LFQ.intensity.5_KL_nWorkFlowt_IO_40min_DDA_", "nWorkFlow")
se$name <- colnames(se)Exclude the following samples:
ProtMet_8U04FR_NG_14_B10_1_6020,
ProtMet_IJ17TV_NG_12_B8_1_6016.
se <- MatrixQCvis:::selectSampleSE(se,
selection = c("ProtMet_8U04FR_NG_14_B10_1_6020", "ProtMet_IJ17TV_NG_12_B8_1_6016"),
mode = "exclude")Translate Uniprot into Symbol.
library("org.Hs.eg.db")
uniprots <- Rkeys(org.Hs.egUNIPROT)
dict <- AnnotationDbi::select(org.Hs.eg.db, uniprots, "SYMBOL", "UNIPROT")
uniprot <- dict$UNIPROT
symbol <- dict$SYMBOL
names(symbol) <- uniprot
## rename
rownames(se) <- unlist(lapply(rownames(se),
function(x) paste(symbol[strsplit(x, split = ";")[[1]]], collapse = ";")))
## remove the features that have no corresponding Symbol
exclude_pattern <- c("NA", "NA;NA", "NA;NA;NA", "NA;NA;NA;NA",
"NA;NA;NA;NA;NA", "NA;NA;NA;NA;NA;NA", "NA;NA;NA;NA;NA;NA;NA")
se <- se[!rownames(se) %in% exclude_pattern, ]Perform log-transformation on the data set.
## keep features with more than ca. 60% (18/35) measured values
se <- se[rowSums(!is.na(assay(se))) >= 18 , ]
dim(se)## [1] 3010 35
cD <- colData(se)
cD$Type_Processing <- paste(cD$Type, cD$Processing, sep = "_")
cD$Individual <- stringr::str_remove(cD$Pseudonym, pattern = "_NG|_TU")
design <- model.matrix(~ 0 + Type_Processing, data = cD)
colnames(design) <- make.names(colnames(design))
cor <- limma::duplicateCorrelation(assay(se), design, block=cD$Individual)
fit <- lmFit(object = assay(se), design = design, block = cD$Individual,
correlation = cor$consensus)## Warning: Partial NA coefficients for 57 probe(s)
## create contrasts
contrasts <- makeContrasts(
autoSP3vsMTBE_SP3 = (Type_ProcessingTU_autoSP3 - Type_ProcessingNG_autoSP3)/2 -
(Type_ProcessingTU_MTBE_SP3 - Type_ProcessingNG_MTBE_SP3)/2,
TUvsNG = (Type_ProcessingTU_autoSP3 - Type_ProcessingTU_MTBE_SP3)/2 -
(Type_ProcessingNG_autoSP3 - Type_ProcessingNG_MTBE_SP3)/2, ## identical to autoSP3vsMTBE_SP3
NG_autoSP3vsMTBE_SP3 = Type_ProcessingNG_autoSP3 - Type_ProcessingNG_MTBE_SP3,
TU_autoSP3vsMTBE_SP3 = Type_ProcessingTU_autoSP3 - Type_ProcessingTU_MTBE_SP3,
autoSP3 = Type_ProcessingTU_autoSP3 - Type_ProcessingNG_autoSP3,
MTBE_SP3 = Type_ProcessingTU_MTBE_SP3 - Type_ProcessingNG_MTBE_SP3,
levels = design)
fit_c <- contrasts.fit(fit, contrasts)
fit_eB <- eBayes(fit_c)
## set parameters for differential expression
num <- Inf
p_val <- 1
adj <- "BH"We test here the DE proteins between NG and TU and compare the results that we get from the two extraction methods.
Ideally, this should result in few DE proteins. This would indicate that the methods yield similar results.
tT <- topTable(fit_eB, number = num, p.value = p_val, adjust.method = adj,
coef = "autoSP3vsMTBE_SP3")
rmarkdown::paged_table(tT)## [1] 2
We test here the DE proteins from the two extraction methods looking only at NG.
Ideally, this should result in few DE proteins. This would indicate that the methods yield similar results.
tT <- topTable(fit_eB, number = num, p.value = p_val, adjust.method = adj,
coef = "NG_autoSP3vsMTBE_SP3")
rmarkdown::paged_table(tT)## [1] 809
We test here the DE proteins from the two extraction methods looking only at TU
Ideally, this should result in few DE proteins. This would indicate that the methods yield similar results.
tT <- topTable(fit_eB, number = num, p.value = p_val, adjust.method = adj,
coef = "TU_autoSP3vsMTBE_SP3")
rmarkdown::paged_table(tT)## [1] 553
tT_TU <- tT
write.table(tT, file = "proteomics_DE_t_TU_autoSP3vsMTBE_SP3.txt",
quote = FALSE, sep = "\t")Ok, this is interesting, when looking at the individual tissues, there are quite many differences, but the differences “vanish” when we combine the different conditions (autoSP3 vs. MTBE_SP3).
We will continue and check the overlap of DE proteins between the contrasts
- autoSP3 vs. MTBE_SP3 (NG)
- autoSP3 vs. MTBE_SP3 (TU)
- autoSP3 vs. MTBE_SP3 (TU-NG)
## only continue with the significant ones
NG_sign <- tT_NG[tT_NG$adj.P.Val < 0.05 & !is.na(tT_NG$adj.P.Val), ]
dim(NG_sign)## [1] 809 8
## [1] 553 8
autoSP3vsMTBE_sign <- tT_autoSP3vsMTBE[tT_autoSP3vsMTBE$adj.P.Val < 0.05 & !is.na(tT_autoSP3vsMTBE$adj.P.Val), ]
dim(autoSP3vsMTBE_sign)## [1] 2 8
## create list
l <- list(NG = rownames(NG_sign), TU = rownames(TU_sign),
autoSP3vsMTBE_SP3 = rownames(autoSP3vsMTBE_sign))
UpSetR::upset(UpSetR::fromList(l), order.by = "freq")pdf("UpSet_NAT_TT_autoSP3vsMTBESP3.pdf")
UpSetR::upset(UpSetR::fromList(l), order.by = "freq")
dev.off()## png
## 2
We will continue and check the overlap of DE proteins between the contrasts:
- TU vs. NG (autoSP3)
- TU vs. NG (MTBE_SP3)
## autoSP3
tT_autoSP3 <- topTable(fit_eB, number = num, p.value = p_val, adjust.method = adj,
coef = "autoSP3")
sum(tT_autoSP3$adj.P.Val < 0.05, na.rm = TRUE)## [1] 1386
write.table(tT_autoSP3, file = "proteomics_DE_t_TUvsNG_autoSP3.txt",
quote = FALSE, sep = "\t")
## MTBE_SP3
tT_MTBE_SP3 <- topTable(fit_eB, number = num, p.value = p_val, adjust.method = adj,
coef = "MTBE_SP3")
rmarkdown::paged_table(tT_MTBE_SP3)## [1] 1382
write.table(tT_MTBE_SP3, file = "proteomics_DE_t_TUvsNG_MTBE_SP3.txt",
quote = FALSE, sep = "\t")
## only continue with the significant ones
autoSP3_sign <- tT_autoSP3[tT_autoSP3$adj.P.Val < 0.05 & !is.na(tT_autoSP3$adj.P.Val), ]
MTBE_SP3_sign <- tT_MTBE_SP3[tT_MTBE_SP3$adj.P.Val < 0.05 & !is.na(tT_MTBE_SP3$adj.P.Val), ]
## create list
l <- list(autoSP3 = autoSP3_sign$ID, MTBE_SP3 = MTBE_SP3_sign$ID)
UpSetR::upset(UpSetR::fromList(l), order.by = "freq")pdf("UpSet_NATvsTT_autoSP3_MTBESP3.pdf")
UpSetR::upset(UpSetR::fromList(l), order.by = "freq")
dev.off()## png
## 2
## Can we say anything about the magnitude of the difference? E.g. the
## overlapping 1004 proteins show on average a larger fold change than those
## detected by only one of the methods? Show in a box plot.
mtbe_auto_shared <- l$autoSP3[l$autoSP3 %in% l$MTBE_SP3]
mtbe_unique <- l$MTBE_SP3[!(l$MTBE_SP3 %in% l$autoSP3)]
auto_unique <- l$autoSP3[!(l$autoSP3 %in% l$MTBE_SP3)]
df_mtbe_shared <- data.frame(
logFC = tT_MTBE_SP3[tT_MTBE_SP3$ID %in% mtbe_auto_shared, "logFC"],
set = "shared", type = "MTBE-SP3")
df_auto_shared <- data.frame(
logFC = tT_autoSP3[tT_autoSP3$ID %in% mtbe_auto_shared, "logFC"],
set = "shared", type = "autoSP3")
df_mtbe_unique <- data.frame(
logFC = tT_MTBE_SP3[tT_MTBE_SP3$ID %in% mtbe_unique, "logFC"],
set = "unique", type = "MTBE-SP3")
df_auto_unique <- data.frame(
logFC = tT_autoSP3[tT_autoSP3$ID %in% auto_unique, "logFC"],
set = "unique", type = "autoSP3")
df_all <- rbind(df_mtbe_shared, df_auto_shared, df_mtbe_unique, df_auto_unique)
g <- ggplot(df_all) +
geom_beeswarm(aes(y = logFC, x = type), size = 0.5, alpha = 0.5) +
facet_wrap(~ set + type, ncol = 4, scales = "free_x") +
theme_classic()
ggsave(filename = "BeeSwarm_NATvsTT_autoSP3_MTBESP3.pdf", plot = g)## Saving 7 x 5 in image
df_tmp <- df_all
df_tmp$logFC <- abs(df_tmp$logFC)
wilcox.test(
df_tmp[df_tmp$set == "shared" & df_tmp$type == "autoSP3", "logFC"],
df_tmp[df_tmp$set == "unique" & df_tmp$type == "autoSP3", "logFC"],
alternative = "greater")##
## Wilcoxon rank sum test with continuity correction
##
## data: df_tmp[df_tmp$set == "shared" & df_tmp$type == "autoSP3", "logFC"] and df_tmp[df_tmp$set == "unique" & df_tmp$type == "autoSP3", "logFC"]
## W = 239420, p-value = 4.105e-13
## alternative hypothesis: true location shift is greater than 0
wilcox.test(
df_tmp[df_tmp$set == "shared" & df_tmp$type == "MTBE-SP3", "logFC"],
df_tmp[df_tmp$set == "unique" & df_tmp$type == "MTBE-SP3", "logFC"],
alternative = "greater")##
## Wilcoxon rank sum test with continuity correction
##
## data: df_tmp[df_tmp$set == "shared" & df_tmp$type == "MTBE-SP3", "logFC"] and df_tmp[df_tmp$set == "unique" & df_tmp$type == "MTBE-SP3", "logFC"]
## W = 230510, p-value = 3.588e-10
## alternative hypothesis: true location shift is greater than 0
## TU vs NG. (taking into account baseline autoSP3/MTBE-SP3)
tT <- topTable(fit_eB, number = num, p.value = p_val, adjust.method = adj,
coef = "TUvsNG")
rmarkdown::paged_table(tT)tT_TUvsNG <- tT
## plot t-values
df <- data.frame(vals_x = tT_autoSP3[rownames(tT_autoSP3), "t"],
vals_y = tT_MTBE_SP3[rownames(tT_autoSP3), "t"],
vals_z = tT_TUvsNG[rownames(tT_autoSP3), "t"])
rownames(df) <- rownames(tT_autoSP3)
## t-values(autoSP3) vs t-values (MTBE-SP3)
g <- ggplot(df, aes_string(x = "vals_x", y = "vals_y")) +
geom_point(aes(alpha = 0.7)) +
xlab("t-values (autoSP3)") + ylab("t-values (MTBE-SP3)") +
ggpubr::stat_cor(method = "spearman", cor.coef.name = "rho") +
ggtitle("NAT vs. TT") +
theme_classic() +
theme(legend.position = "none")## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Removed 57 rows containing non-finite values (`stat_cor()`).
## Warning: Removed 57 rows containing missing values (`geom_point()`).
## Saving 7 x 5 in image
## Warning: Removed 57 rows containing non-finite values (`stat_cor()`).
## Removed 57 rows containing missing values (`geom_point()`).
cor.test(tT_autoSP3[rownames(tT_autoSP3), "t"],
tT_MTBE_SP3[rownames(tT_autoSP3), "t"], method = "spearman")##
## Spearman's rank correlation rho
##
## data: tT_autoSP3[rownames(tT_autoSP3), "t"] and tT_MTBE_SP3[rownames(tT_autoSP3), "t"]
## S = 711496290, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.8342194
## t-values(autoSP3) vs t-values (AutoSP3vsMTBE-SP3)
g <- ggplot(df, aes_string(x = "vals_x", y = "vals_z")) +
geom_point(aes(alpha = 0.7)) +
xlab("t-values (autoSP3)") + ylab("t-values (autoSP3/MTBE-SP3)") +
ggpubr::stat_cor(method = "spearman", cor.coef.name = "rho") +
ggtitle("NAT vs. TT") +
theme_classic() +
theme(legend.position = "none")
g ## Warning: Removed 57 rows containing non-finite values (`stat_cor()`).
## Removed 57 rows containing missing values (`geom_point()`).
## Saving 7 x 5 in image
## Warning: Removed 57 rows containing non-finite values (`stat_cor()`).
## Removed 57 rows containing missing values (`geom_point()`).
## t-values(MTBE-SP3) vs t-values (AutoSP3vsMTBE-SP3)
g <- ggplot(df, aes_string(x = "vals_y", y = "vals_z")) +
geom_point(aes(alpha = 0.7)) +
xlab("t-values (MTBE-SP3)") + ylab("t-values (autoSP3/MTBE-SP3)") +
ggpubr::stat_cor(method = "spearman", cor.coef.name = "rho") +
ggtitle("NAT vs. TT") +
theme_classic() +
theme(legend.position = "none")
g ## Warning: Removed 57 rows containing non-finite values (`stat_cor()`).
## Removed 57 rows containing missing values (`geom_point()`).
## Saving 7 x 5 in image
## Warning: Removed 57 rows containing non-finite values (`stat_cor()`).
## Removed 57 rows containing missing values (`geom_point()`).
Run now the GO enrichment tests. Use the raw (unadjusted) values and return the GO terms for the ontologies Biological Process (BP), Molecular Function (MF) and Cellular Component (CC). Perform a Fisher test with the significant features (\(\alpha\) < 0.05).
For autoSP3.
## [1] "## BP"
## [1] "## MF"
## [1] "## CC"
## [1] "## BP mitochondria:"
## [1] 7
## [1] "## MF mitochondria:"
## [1] 0
## [1] "## CC mitochondria:"
## [1] 1
Run now the GO enrichment tests. Use the raw (unadjusted) values and return the GO terms for the ontologies Biological Process (BP), Molecular Function (MF) and Cellular Component (CC). Perform a Fisher test with the significant features (\(\alpha\) < 0.05).
For MTBE-SP3.
## [1] "## BP"
## [1] "## MF"
## [1] "## CC"
## [1] "## BP mitochondria:"
## [1] 0
## [1] "## MF mitochondria:"
## [1] 0
## [1] "## CC mitochondria:"
## [1] 0
cD <- colData(se)
cD$Type_Processing <- paste(cD$Type, cD$Processing, sep = "_")
cD$Individual <- stringr::str_remove(cD$Pseudonym, pattern = "_NG|_TU")
design <- model.matrix(~ 0 + Type_Processing + Individual, data = cD)
fit <- lmFit(object = assay(se), design = design)## Warning: Partial NA coefficients for 592 probe(s)
## create contrasts
contrasts <- makeContrasts(
autoSP3 = Type_ProcessingTU_autoSP3 - Type_ProcessingNG_autoSP3,
MTBE_SP3 = Type_ProcessingTU_MTBE_SP3 - Type_ProcessingNG_MTBE_SP3,
levels = design)
fit_c <- contrasts.fit(fit, contrasts)
fit_eB <- eBayes(fit_c)
## set parameters for differential expression
num <- Inf
p_val <- 1
adj <- "BH"
## get the features for autoSP3
tT_autoSP3_covariate <- topTable(fit_eB, number = num, p.value = p_val,
adjust.method = adj, coef = "autoSP3")
cor(x = tT_autoSP3_covariate$t[order(rownames(tT_autoSP3_covariate))],
y = tT_autoSP3$t[order(rownames(tT_autoSP3))],
use = "pairwise.complete.obs")## [1] 0.9756756
## get the features for MTBE_SP3
tT_MTBE_SP3_covariate <- topTable(fit_eB, number = num, p.value = p_val,
adjust.method = adj, coef = "MTBE_SP3")
cor(x = tT_MTBE_SP3_covariate$t[order(rownames(tT_MTBE_SP3_covariate))],
y = tT_MTBE_SP3$t[order(rownames(tT_MTBE_SP3))],
use = "pairwise.complete.obs")## [1] 0.9725197
We test here the effect of tissue heterogeneity within the autoSP3 and MTBE-SP3 samples. Therefore, we randomly select 50% of the samples from within autoSP3 and MTBE-SP3 samples and run linear models within each extraction method.
In case of tissue heterogeneity, there should be some proteins that are differentially abundant.
se$Patient_ID <- stringr::str_remove(se$Pseudonym, pattern = "_TU|_NG")
##
## MTBE-SP3
se_mtbesp3 <- se[, se$Processing == "MTBE_SP3"]
ids_mtbesp3 <- se_mtbesp3$Patient_ID |> unique()
set.seed(2023)
ids_group1 <- sample(ids_mtbesp3, size = floor(length(ids_mtbesp3)/ 2),
replace = FALSE)
se_mtbesp3$random_group <- ifelse(se_mtbesp3$Patient_ID %in% ids_group1, "1", "2")
## run the linear model, adjust for the tissue effect
design <- model.matrix(~ 0 + random_group + Type, data = colData(se_mtbesp3))
colnames(design) <- make.names(colnames(design))
fit <- lmFit(object = assay(se_mtbesp3), design = design)## Warning: Partial NA coefficients for 17 probe(s)
## make contrasts
contrasts <- makeContrasts(
group1_vs_group2 = random_group1 - random_group2,
levels = design)
fit_c <- contrasts.fit(fit, contrasts)
fit_eB <- eBayes(fit_c)
## get the features for MTBE-SP3
tT_MTBESP3 <- topTable(fit_eB, number = num, p.value = p_val,
adjust.method = adj)
sum(tT_MTBESP3[, "adj.P.Val"] < 0.05)## [1] NA
##
## autoSP3
se_autosp3 <- se[, se$Processing == "autoSP3"]
ids_autosp3 <- se_autosp3$Patient_ID |> unique()
set.seed(2023)
ids_group1 <- sample(ids_autosp3, size = floor(length(ids_autosp3)/ 2),
replace = FALSE)
se_autosp3$random_group <- ifelse(se_autosp3$Patient_ID %in% ids_group1, "1", "2")
## run the linear model, adjust for the tissue effect
design <- model.matrix(~ 0 + random_group + Type , data = colData(se_autosp3))
colnames(design) <- make.names(colnames(design))
fit <- lmFit(object = assay(se_autosp3), design = design)## Warning: Partial NA coefficients for 46 probe(s)
## make contrasts
contrasts <- makeContrasts(
group1_vs_group2 = random_group1 - random_group2,
levels = design)
fit_c <- contrasts.fit(fit, contrasts)
fit_eB <- eBayes(fit_c)
## get the features for MTBE-SP3
tT_autoSP3 <- topTable(fit_eB, number = num, p.value = p_val,
adjust.method = adj)
sum(tT_autoSP3[, "adj.P.Val"] < 0.05)## [1] 0
European Molecular Biology Laboratory, Meyerhofstrasse 1, 69117 Heidelberg, Germany↩︎